#!/usr/bin/env python3
#
# This script generates constants for efficient computation of the gzip CRC-32.

import sys

# This is the generator polynomial G(x) of the gzip CRC-32, represented as an
# int using the natural mapping between bits and polynomial coefficients.
G = 0x104c11db7

# XOR (add) an iterable of polynomials.
def xor(iterable):
    res = 0
    for val in iterable:
        res ^= val
    return res

# Multiply two polynomials.
def clmul(a, b):
    return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0)

# Polynomial division floor(a / b).
def div(a, b):
    q = 0
    while a.bit_length() >= b.bit_length():
        q ^= 1 << (a.bit_length() - b.bit_length())
        a ^= b << (a.bit_length() - b.bit_length())
    return q

# Reduce the polynomial 'a' modulo the polynomial 'b'.
def reduce(a, b):
    return a ^ clmul(div(a, b), b)

# Reverse the bits of a polynomial.
def bitreverse(poly, num_bits):
    return xor(1 << (num_bits - 1 - i) for i in range(num_bits)
               if (poly & (1 << i)) != 0)

# Compute x^d mod G.
def x_to_the_d(d):
    if d < G.bit_length() - 1:
        return 1 << d
    t = x_to_the_d(d//2)
    t = clmul(t, t)
    if d % 2 != 0:
        t <<= 1
    return reduce(t, G)

def gen_tables():
    print('/*')
    print(' * crc32_tables.h - data tables for CRC-32 computation')
    print(' *')
    print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.')
    print(' */')
    for n in [1, 8]:
        print('')
        print(f'static const u32 crc32_slice{n}_table[] MAYBE_UNUSED = {{')
        # The i'th table entry is the CRC-32 of the message consisting of byte
        # i % 256 followed by i // 256 zero bytes.
        polys = [bitreverse(i % 256, 8) << (32 + 8*(i//256)) for i in range(256 * n)]
        polys = [bitreverse(reduce(poly, G), 32) for poly in polys]
        for i in range(0, len(polys), 4):
            print(f'\t0x{polys[i+0]:08x}, 0x{polys[i+1]:08x}, 0x{polys[i+2]:08x}, 0x{polys[i+3]:08x},')
        print('};')

# Compute the constant multipliers needed for "folding" over various distances
# with the gzip CRC-32.  Each such multiplier is x^d mod G(x) for some distance
# d, in bits, over which the folding is occurring.
#
# Folding works as follows: let A(x) be a polynomial (possibly reduced partially
# or fully mod G(x)) for part of the message, and let B(x) be a polynomial
# (possibly reduced partially or fully mod G(x)) for a later part of the
# message.  The unreduced combined polynomial is A(x)*x^d + B(x), where d is the
# number of bits separating the two parts of the message plus len(B(x)).  Since
# mod G(x) can be applied at any point, x^d mod G(x) can be precomputed and used
# instead of x^d unreduced.  That allows the combined polynomial to be computed
# relatively easily in a partially-reduced form A(x)*(x^d mod G(x)) + B(x), with
# length max(len(A(x)) + 31, len(B(x))).  This does require doing a polynomial
# multiplication (carryless multiplication).
#
# "Folding" in this way can be used for the entire CRC computation except the
# final reduction to 32 bits; this works well when CPU support for carryless
# multiplication is available.  It can also be used to combine CRCs of different
# parts of the message that were computed using a different method.
#
# Note that the gzip CRC-32 uses bit-reversed polynomials.  I.e., the low order
# bits are really the high order polynomial coefficients.
def gen_multipliers():
    print('/*')
    print(' * crc32_multipliers.h - constants for CRC-32 folding')
    print(' *')
    print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.')
    print(' */')
    print('')

    # Compute the multipliers needed for CRC-32 folding with carryless
    # multiplication instructions that operate on the 64-bit halves of 128-bit
    # segments.  Using the terminology from earlier, for each 64-bit fold
    # len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial multiplied by
    # a 32-bit one produces a 95-bit one.  When A(x) is the low order polynomial
    # half of a 128-bit segments (high order physical half), the separation
    # between the message parts is the total length of the 128-bit segments
    # separating the values.  When A(x) is the high order polynomial half, the
    # separation is 64 bits greater.
    for i in range(1, 33):
        sep_lo = 128 * (i - 1)
        sep_hi = sep_lo + 64
        len_B = 95
        for d in [sep_hi + len_B, # A(x) = high 64 polynomial bits (low 64 physical bits)
                  sep_lo + len_B # A(x) = low 64 polynomial bits (high 64 physical bits)
                  ]:
            poly = bitreverse(x_to_the_d(d), 32)
            print(f'#define CRC32_X{d}_MODG 0x{poly:08x} /* x^{d} mod G(x) */')
        print('')

    # Compute constants for the final 128 => 32 bit reduction.
    poly = bitreverse(div(1 << 95, G), 64)
    print(f'#define CRC32_BARRETT_CONSTANT_1 0x{poly:016x}ULL /* floor(x^95 / G(x)) */')
    poly = bitreverse(G, 33)
    print(f'#define CRC32_BARRETT_CONSTANT_2 0x{poly:016x}ULL /* G(x) */')

    # Compute multipliers for combining the CRCs of separate chunks.
    print('')
    num_chunks = 4
    table_len = 129
    min_chunk_len = 128
    print(f'#define CRC32_NUM_CHUNKS {num_chunks}')
    print(f'#define CRC32_MIN_VARIABLE_CHUNK_LEN {min_chunk_len}UL')
    print(f'#define CRC32_MAX_VARIABLE_CHUNK_LEN {(table_len-1) * min_chunk_len}UL')
    print('')
    print('/* Multipliers for implementations that use a variable chunk length */')
    print('static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {')
    print('\t{ 0 /* unused row */ },')
    for i in range(1, table_len):
        chunk_len = i * min_chunk_len
        print(f'\t/* chunk_len={chunk_len} */')
        print('\t{ ', end='')
        for j in range(num_chunks - 1, 0, -1):
            d = (j * 8 * chunk_len) - 33
            poly = bitreverse(x_to_the_d(d), 32)
            print(f'0x{poly:08x} /* x^{d} mod G(x) */, ', end='')
        print('},')
    print('};')
    fixed_chunk_len = 32768
    print('')
    print('/* Multipliers for implementations that use a large fixed chunk length */')
    print(f'#define CRC32_FIXED_CHUNK_LEN {fixed_chunk_len}UL')
    for j in range(1, num_chunks):
        d = (j * 8 * fixed_chunk_len) - 33
        poly = bitreverse(x_to_the_d(d), 32)
        print(f'#define CRC32_FIXED_CHUNK_MULT_{j} 0x{poly:08x} /* x^{d} mod G(x) */')

with open('lib/crc32_tables.h', 'w') as f:
    sys.stdout = f
    gen_tables()
with open('lib/crc32_multipliers.h', 'w') as f:
    sys.stdout = f
    gen_multipliers()
